Load required libraries
rm(list=ls())
library(ggplot2)
library(dplyr)
library(tidyr)
library(RMySQL)
library(stringr)
library(pcaPP)
library(directlabels)
library(proto)
Load in wordbank data
## OPEN DATABASE CONNECTION ##
wordbank <- src_mysql(dbname="wordbank", host="54.149.39.46",
user="wordbank", password="wordbank")
## NOW LOAD TABLES ##
source.table <- tbl(wordbank, "common_source")
admin.table <- tbl(wordbank, "common_administration")
child.table <- tbl(wordbank, "common_child")
wordmapping.table <- tbl(wordbank, "common_wordmapping")
instruments.table <- tbl(wordbank, "common_instrumentsmap")
english.ws.table <- tbl(wordbank, "instruments_english_ws")
spanish.ws.table <- tbl(wordbank, "instruments_spanish_ws")
norwegian.ws.table <- tbl(wordbank, "instruments_norwegian_ws")
danish.ws.table <- tbl(wordbank, "instruments_danish_ws")
Get kid data and put together.
# Get administration info
admins <- admin.table %>%
select(data_id,child_id,age,source_id) %>%
rename(id = data_id, child.id = child_id, source.id = source_id)
admins <- as.data.frame(admins)
# Get demographic variables for each child
demos <- select(child.table,id,sex,mom_ed,birth_order) %>%
rename(child.id = id) # Rename id fields
demos <- as.data.frame(demos)
# Join age and demographics together
child.data <- as.tbl(left_join(admins,demos))
Set up mappings and instruments.
mapping <- as.data.frame(wordmapping.table)
instruments <- as.data.frame(instruments.table) %>%
rename(instrument_id = id)
items <- left_join(mapping, instruments)
Fucntion for getting all of the data in wordbank for a given language (kid x item).
get.language.data <- function(lang.table, lang.items, lang, child.data) {
instrument.items <- lang.items %>%
filter(language == lang, form == 'WS') %>%
select(item, type, category, lexical_category, definition) %>%
mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies
instrument.data <- as.data.frame(lang.table) %>%
rename(id = basetable_ptr_id) %>% # Rename the id
gather(item, value, -id) %>% # Arrange in longform
mutate(item = str_replace(item, "item_", "")) # Strip off item_
d <- left_join(instrument.data, instrument.items)
d <- left_join(d, child.data)
}
Get (kid x item) data for all languages.
d.english <- get.language.data(lang.table=english.ws.table,
lang.items=items,
lang="English",
child.data)
d.spanish <- get.language.data(lang.table=spanish.ws.table,
lang.items=items,
lang="Spanish",
child.data)
d.norwegian <- get.language.data(lang.table=norwegian.ws.table,
lang.items=items,
lang="Norwegian",
child.data)
# Norwegian data is loaded in funny -- NAs in wordform are actually 0s
d.norwegian[d.norwegian$type %in% c("word_form","word")
& is.na(d.norwegian$value),]$value = ""
d.danish <- get.language.data(lang.table=danish.ws.table,
lang.items=items,
lang="Danish",
child.data)
# Danish data is loaded in funny -- NAs in wordform are actually 0s
d.danish[d.danish$type %in% c("word_form","word")
& is.na(d.danish$value),]$value = ""
Function for getting vocab size data.
language.vocab.sizes <- function(lang.data) {
d.vocab <- lang.data %>%
filter(type == "word") %>%
group_by(age,id) %>%
summarise(vocab.sum = sum(value == "produces", na.rm=TRUE),
vocab.mean = vocab.sum/length(value))
return(d.vocab)
}
Function for getting (kid x {vocab size, syntax score, morphology score}) data.
summarise.language.data <- function(lang.data,lang) {
d.vocab <- language.vocab.sizes(lang.data)
d.complexity <- lang.data %>%
filter(type == "complexity") %>%
group_by(id) %>%
summarise(all.na = all(is.na(value)),
complexity.sum = sum(value == "complex",
na.rm=TRUE) / length(value)) %>%
mutate(complexity = ifelse(all.na,NA,complexity.sum)) %>%
select(-all.na,-complexity.sum) # Deals with ifelse
# forcing values to logical
d.wordform <- lang.data %>%
filter(type == "word_form") %>%
group_by(id) %>%
summarise(all.na = all(is.na(value)),
wordform.sum = sum(value == "produces",
na.rm=TRUE) / length(value)) %>%
mutate(wordform = ifelse(all.na,NA,wordform.sum)) %>%
select(-all.na,-wordform.sum) # Deals with ifelse
# forcing values to logical
# Spanish doesn't have ending data, so its skipped, at least for now.
# d.ending <- d %>%
# filter(type %in% c("ending")) %>%
# group_by(id) %>%
# summarise(ending_sometimes = mean(value == "sometimes" |
# value == "often",
# na.rm=TRUE),
# ending_often = mean(value == "often",
# na.rm=TRUE))
# d.composite <- left_join(d.composite, d.ending)
d.composite <- left_join(d.vocab, d.complexity)
d.composite <- left_join(d.composite, d.wordform) %>%
ungroup() %>%
filter(age > 15 & age < 32) %>%
mutate(age.group = cut(age, breaks = c(15, 19, 23, 27, 31)),
age.bin = cut(age, quantile(age), include.lowest=T))
d.composite$age.bin <- factor(d.composite$age.bin, labels=c(1,2,3,4))
# %>%
# filter(num.complexity.na == 0) %>%
# select(-num.complexity.na)
#
d.composite$language <- lang
return(d.composite)
}
Get (kid x {vocab size, syntax score, morphology score}) data for all languages and aggregate them.
summary.english <- summarise.language.data(d.english,"English")
summary.spanish <- summarise.language.data(d.spanish,"Spanish")
summary.norwegian <- summarise.language.data(d.norwegian,"Norwegian")
summary.danish <- summarise.language.data(d.danish,"Danish")
summary.data <- rbind_list(summary.english, summary.spanish,
summary.norwegian, summary.danish) %>%
mutate(language = factor(language, levels=c("English", "Spanish",
"Norwegian", "Danish")))
# gather for plotting
ms <- summary.data %>% gather(measure, score, complexity:wordform) %>%
mutate(measure = factor(measure, levels = c("wordform","complexity"),
labels = c("Word Form", "Complexity")))
#ms %>%
# group_by(language, age.bin) %>%
# summarise(n = n())
Using Age and Vocab to predict Morphology and Syntax Scores.
#quartz(width=8,height=7.5)
#ggplot(ms, aes(x = vocab.mean, y = score, colour = age.group, fill = age.group,
# label = age.group)) +
ggplot(ms, aes(x = vocab.mean, y = score, colour = age.bin, fill = age.bin,
label = age.bin)) +
#geom_point(alpha=.5, size=.8) +
geom_jitter(alpha=.6, ssize=.8) +
geom_smooth(method="lm", formula = y ~ I(x^2) - 1) +
facet_grid(language~measure) +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
name = "Vocabulary Size") +
scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
"Score (Mean Items)") +
theme_bw(base_size = 14) +
scale_color_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1")
Using Morphology scores to predict Syntax scores.
#quartz(width=8,height=7.5)
ggplot(summary.data,aes(x = wordform, y = complexity, fill=age.group,colour=age.group,
label=age.group)) +
facet_wrap( ~ language) +
geom_jitter(size=1)+
geom_smooth(method="lm", formula = y ~ exp(x) - 1) +
scale_x_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),
name = "Morphology Score") +
scale_y_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),"Syntax Score") +
scale_color_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1") +
theme_bw(base_size = 14)
Examine relationship between vocab size, age, and syntax/morphology scores.
# fit regressions to data
english.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1,
data=filter(ms,language=="English"))
spanish.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1,
data=filter(ms,language=="Spanish"))
norwegian.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1,
data=filter(ms,language=="Norwegian"))
danish.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1,
data=filter(ms,language=="Danish"))
english.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1,
data=filter(ms,language=="English"))
spanish.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1,
data=filter(ms,language=="Spanish"))
norwegian.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1,
data=filter(ms,language=="Norwegian"))
danish.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1,
data=filter(ms,language=="Danish"))
predicted.data <- as.data.frame(ms)
predicted.data$predicted.grp <- NA
predicted.data$predicted.bin <- NA
predicted.data[predicted.data$language=="English",]$predicted.grp <-
predict.lm(english.lm.grp, predicted.data[predicted.data$language=="English",])
predicted.data[predicted.data$language=="Spanish",]$predicted.grp <-
predict.lm(spanish.lm.grp, predicted.data[predicted.data$language=="Spanish",])
predicted.data[predicted.data$language=="Norwegian",]$predicted.grp <-
predict.lm(norwegian.lm.grp, predicted.data[predicted.data$language=="Norwegian",])
predicted.data[predicted.data$language=="Danish",]$predicted.grp <-
predict.lm(danish.lm.grp, predicted.data[predicted.data$language=="Danish",])
predicted.data[predicted.data$language=="English",]$predicted.bin <-
predict.lm(english.lm.bin, predicted.data[predicted.data$language=="English",])
predicted.data[predicted.data$language=="Spanish",]$predicted.bin <-
predict.lm(spanish.lm.bin, predicted.data[predicted.data$language=="Spanish",])
predicted.data[predicted.data$language=="Norwegian",]$predicted.bin <-
predict.lm(norwegian.lm.bin, predicted.data[predicted.data$language=="Norwegian",])
predicted.data[predicted.data$language=="Danish",]$predicted.bin <-
predict.lm(danish.lm.bin, predicted.data[predicted.data$language=="Danish",])
Replot original correlation with fitted model.
#quartz(width=6,height=6)
ggplot(predicted.data, aes(x = vocab.mean, y = score,
colour = age.group, fill = age.group,
label = age.group)) +
geom_jitter(alpha=.6, size=.8, pch="o") +
geom_line(aes(y=predicted.grp),size=0.8) +
# facet_grid(measure~language) +
facet_grid(language~measure) +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
name = "Vocabulary Size") +
scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
"Score (Mean Items)") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
scale_color_brewer(palette="Set1",
name="Age Group (months)") +
# labels=c("16-19","20-23","24-27","28-31")) +
scale_fill_brewer(palette="Set1",
guide=FALSE)
#ggsave(file=("grammar.pdf"), width=6, height=6)
Same plot but with equal number of kids age bins.
#quartz(width=6,height=6)
ggplot(predicted.data, aes(x = vocab.mean, y = score,
colour = age.bin, fill = age.bin,
label = age.bin)) +
geom_jitter(alpha=.6, size=.8, pch="o") +
geom_line(aes(y=predicted.bin),size=0.8) +
# facet_grid(measure~language) +
facet_grid(language~measure) +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
name = "Vocabulary Size") +
scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
"Score (Mean Items)") +
theme_bw(base_size = 11) +
theme(legend.position = "bottom") +
scale_color_brewer(palette="Set1",
name="Age Group (months)") +
# labels=c("16-19","20-23","24-27","28-31")) +
scale_fill_brewer(palette="Set1",
guide=FALSE)
#ggsave(file=("grammar.pdf"), width=6, height=6)
#plot.measure <- function(meas) {
#plot <- ggplot(filter(predicted.data, measure==meas),
# aes(x = vocab.mean, y = score,
# colour = age.group, fill = age.group,
# label = age.group)) +
# geom_jitter(alpha=.6,size=.8, pch="o") +
# geom_line(aes(y=predicted.grp),size=0.9) +
# facet_grid(language~.) +
# scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
# name = "Vocabulary Size") +
# scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
# paste(meas, "Score (Mean Items)")) +
# theme_bw(base_size = 11) +
# scale_color_brewer(palette="Set1",
# name="Age Group\n (months)",
# labels=c("16-19","20-23","24-27","28-31")) +
# scale_fill_brewer(palette="Set1",
# guide=FALSE) +
# theme(legend.position = "bottom",
# legend.background = element_rect(colour = "grey"))
#return(plot)
#}
#quartz(width=4,height=7)
#plot.measure("Complexity") %>%
#ggsave(file=("complexity.pdf"), width=4, height=7)
#quartz(width=4,height=7)
#plot.measure("Word Form") %>%
#ggsave(file=("wordform.png"), width=4, height=7)
Compute (vocab x age) interaction coefficients for each measure and language.
mod.coef.fun <- function(score, vocab.mean, age) {
return(coef(lm(score ~ I((vocab.mean*100)^2)*age+0 - age))[2])
}
mod.se.fun <- function(score, vocab.mean, age) {
return(summary(lm(score ~ I((vocab.mean*100)^2)*age+0 - age))$coefficients[2,2])
}
coefs <- ms %>%
group_by(language, measure) %>%
summarise(coef = mod.coef.fun(score,vocab.mean,age),
se = mod.se.fun(score,vocab.mean,age))
Plot coefficients for each language.
#quartz(width=6, height=4)
ggplot(coefs,
aes(x=language, y=coef, fill=measure)) +
geom_bar(position="dodge", stat="identity") +
geom_linerange(aes(ymin=coef-se, ymax=coef+se),
position = position_dodge(width=.9)) +
ylab("Age interaction coefficient") +
xlab("Language") +
theme(legend.position = "bottom") +
scale_fill_brewer(palette = "Set1",
name="")
#ggsave(file=("coeffs.pdf"), width=6, height=4)
Get (kid x item x {vocab size, value}) data for wordform and complexity items.
grammar.by.item <- function(lang.data, lang) {
d.vocab <- language.vocab.sizes(lang.data)
d.complexity <- lang.data %>%
filter(type == "complexity") %>%
mutate(value = value == "complex") %>%
select(-lexical_category, -category)
d.wordform <- lang.data %>%
filter(type == "word_form") %>%
mutate(value = value == "produces") %>%
select(-lexical_category, -category)
d.data <- rbind(d.complexity, d.wordform)
d.data <- left_join(d.vocab, d.data) %>%
filter(age > 15, age < 33)
return(d.data)
}
english.grammar.by.item <- grammar.by.item(d.english,"English")
spanish.grammar.by.item <- grammar.by.item(d.spanish,"Spanish")
norwegian.grammar.by.item <- grammar.by.item(d.norwegian,"Norwegian")
danish.grammar.by.item <- grammar.by.item(d.danish,"Danish")
Compute (vocab x age) interaction terms for each wordform and complexity item.
#compute interaction terms each item
i.terms.function <- function(data,x) {
return(summary(glm(value ~ I((vocab.mean*100)^2)*age+0,
data=filter(data,definition==x),
family="binomial"))$coefficients[3,3])
}
complexity.diff <- function(item) {
if(length(grep("/", item)) == 0) {return(item)}
else{
phrases <- str_split(item," / ")[[1]]
first.phrase <- str_split(phrases[1], " ")[[1]]
second.phrase <- str_split(phrases[2], " ")[[1]]
first.diff <- setdiff(first.phrase,second.phrase)
second.diff <- setdiff(second.phrase,first.phrase)
if(length(first.diff)==0) return(paste(second.diff,collapse=" "))
else if(length(second.diff)==0) return(paste(first.diff,collapse=" "))
else{first.phrase = paste(first.diff, collapse =" ")
second.phrase = paste(second.diff,collapse = " ")
return(paste(first.phrase, second.phrase ,sep=" / "))}
}
}
lang.interaction.terms<- function(grammar.by.item) {
complexity.terms <- sapply(unique(filter(grammar.by.item,
type=="complexity")$definition),
function(item) i.terms.function(grammar.by.item,
item))
complexity.terms <- data.frame(type = "complexity",
definition=names(complexity.terms),
item=1:length(complexity.terms),
term = complexity.terms,
row.names=NULL) %>%
mutate(definition= sapply(definition,complexity.diff),
item = paste(item,definition,sep=". ")) %>%
select(-definition)
wordform.terms <- sapply(unique(filter(grammar.by.item,
type=="word_form")$definition),
function(item) i.terms.function(grammar.by.item,
item))
wordform.terms <- data.frame(type = "wordform",
item=names(wordform.terms),
term = wordform.terms,
row.names=NULL)
#rename results to be human-readable
interaction.terms <-rbind(complexity.terms,wordform.terms) %>%
arrange(term) %>%
mutate(item = factor(item,levels=item))
return(interaction.terms)
}
spanish.grammar.by.item$definition <- spanish.grammar.by.item$item
norwegian.grammar.by.item$definition <- norwegian.grammar.by.item$item
danish.grammar.by.item$definition <- danish.grammar.by.item$item
english.interaction.terms <- lang.interaction.terms(english.grammar.by.item)
spanish.interaction.terms <-lang.interaction.terms(spanish.grammar.by.item)
norwegian.interaction.terms <- lang.interaction.terms(norwegian.grammar.by.item)
danish.interaction.terms <- lang.interaction.terms(danish.grammar.by.item)
Plot interaction terms by item for each language.
interaction.plot <- function(lang.interaction.terms, lang) {
plt <- ggplot(lang.interaction.terms,
aes(x=item,y=term,fill=type,label=item)) +
geom_bar(stat="identity", position="identity", alpha=.5) +
geom_text(y=0.15, angle=90, hjust=0, size=3.5) +
theme_bw(base_size = 14) +
scale_y_continuous(name="Age x Vocabulary Interaction Z-score") +
# limits=c(-10,20),
# breaks=seq(-10,20,2.5)) +
scale_x_discrete(name="CDI Item",breaks=NULL) +
scale_fill_brewer(palette="Set1") +
scale_colour_brewer(palette="Set1") +
theme(legend.position="bottom") +
ggtitle(lang)
return(plt)
}
interaction.plot(english.interaction.terms, "English") #%>%
#ggsave(file=("english_interactions.png"), width=10, height=5)
interaction.plot(spanish.interaction.terms, "Spanish") #%>%
#ggsave(file=("spanish_interactions.png"), width=10, height=5)
interaction.plot(norwegian.interaction.terms, "Norwegian") #%>%
#ggsave(file=("norwegian_interactions.png"), width=10, height=5)
interaction.plot(danish.interaction.terms, "Danish") #$>$
#ggsave(file=("danish_interactions.png"), width=10, height=5)
Function for computing vocabulary composition for each speaker of a language.
vocab.composition <- function(lang.data,lang) {
d.vocab <- language.vocab.sizes(lang.data)
d.cat <- lang.data %>%
filter(type == "word") %>%
group_by(id,lexical_category) %>%
summarise(cat = sum(value == "produces", na.rm=TRUE))
d.vocab.comp <- left_join(d.vocab, d.cat) %>%
mutate(prop = cat / vocab.sum)
d.vocab.comp$language = lang
return(d.vocab.comp)
}
Function for computing CDI form composition for all languages.
lang.vocab.composition <- function(lang.items) {
lang.words <- lang.items %>%
filter(form == "WS",type=="word")
lang.num.total <- lang.words %>%
group_by(language) %>%
summarise(n = n())
lang.vocab.comp <- lang.words %>%
group_by(language,lexical_category) %>%
summarise(num.per.cat = n())
lang.vocab.comp <- left_join(lang.vocab.comp, lang.num.total) %>%
mutate(prop.per.cat = num.per.cat/n)
return(lang.vocab.comp)
}
Get vocabulary composition data for all languages.
# get form compositions
lang.vocab.comp <- lang.vocab.composition(items) %>%
ungroup() %>%
mutate(language = factor(language,
levels = c("English", "Spanish",
"Norwegian", "Danish")),
lexical_category = factor(lexical_category,
levels=c("nouns","predicates",
"function_words","other"),
labels=c("Nouns", "Predicates",
"Function Words","Other"))) %>%
filter(lexical_category != "Other")
# get data for kids in each language
vocab.comp.english <- vocab.composition(d.english,"English")
vocab.comp.spanish <- vocab.composition(d.spanish,"Spanish")
vocab.comp.norwegian <- vocab.composition(d.norwegian,"Norwegian")
vocab.comp.danish <- vocab.composition(d.danish,"Danish")
# aggregate data for all languages together
summary.vocab.comp <- rbind_list(vocab.comp.english,vocab.comp.spanish,
vocab.comp.norwegian,vocab.comp.danish) %>%
# filter(age > 15 & age < 33) %>%
mutate(#age.group = cut(age, breaks = c(15, 20, 24, 28, 32)),
language = factor(language,
levels = c("English", "Spanish",
"Norwegian", "Danish")),
lexical_category = factor(lexical_category,
levels = c("nouns", "predicates",
"function_words", "other"),
labels = c("Nouns", "Predicates",
"Function Words", "Other")))
summary.vocab.comp <- left_join(ms, summary.vocab.comp)
Plot vocabulary composition by language.
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
aes(x=vocab.mean, y=prop, colour=lexical_category,
shape = lexical_category, fill = lexical_category,
label=lexical_category)) +
geom_point(size = 1, alpha = 0.25) +
facet_wrap(~ language) +
geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of total vocabulary") +
scale_x_continuous(name = "Vocabulary Size") +
geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")+
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="none")
Plot vocabulary composition by language and age group.
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
aes(x=vocab.mean, y=prop, colour=lexical_category,
shape = lexical_category, fill = lexical_category,
label = lexical_category)) +
geom_jitter(size = 1, alpha = 0.5) +
facet_grid(language ~ age.group) +
geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of total vocabulary") +
scale_x_continuous(name = "Vocabulary Size") +
geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")+
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="none")
Plot vocabulary composition by language, split by age group.
#quartz(width=7, height=6)
ggplot(filter(summary.vocab.comp, lexical_category != "Other"),
aes(x=vocab.mean, y=prop, colour=age.group, linetype = lexical_category)) +
# geom_jitter(size = 1, alpha = 0.5) +
facet_wrap(~ language) +
geom_hline(data=lang.vocab.comp,
aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(method='loess', span=.5) +
scale_x_continuous(limits = c(0, 1), breaks = seq(0,1,.2),
name = "Vocabulary Size") +
scale_y_continuous(limits = c(0, .6), breaks = seq(0,1,.2),
"Proportion of total vocabulary") +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1", name = "Age Group (months)") +
scale_fill_brewer(palette = "Set1") +
scale_linetype(name = "Lexical Category") +
theme(#axis.text.x = element_text(angle=-40, hjust = 0),
# axis.title.y = element_text(vjust=0.35),
# axis.title.x = element_text(vjust=-0.5),
legend.position="bottom")
#ggsave(file=("age_composition.pdf"), width=7, height=6)
Plot vocabulary composition by language and lexical category.
#quartz(width=8, height=6)
ggplot(filter(summary.vocab.comp, lexical_category != "Other"),
aes(x=vocab.mean, y=prop, colour = age.group, fill = age.group)) +
# geom_jitter(size = 1, alpha = 0.5) +
facet_grid(language ~ lexical_category) +
geom_hline(data=lang.vocab.comp,
aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=age.group), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of total vocabulary") +
scale_x_continuous(name = "Vocabulary Size") +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")+
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="none")
#ggsave(file=("age_composition.pdf"), width=8, height=6)
Re-compute vocabulary composition data as a proportion of items on the CDI rather than a proportion of vocabulary size.
prop.comp <- left_join(filter(summary.vocab.comp, lexical_category != "Other"),
lang.vocab.comp) %>%
mutate(#lexical_category = factor(lexical_category,
# levels = c("Nouns", "Predicates",
# "Function Words", "Other")),
cdi.prop = cat / num.per.cat) %>%
select(-prop, -prop.per.cat, -n, -num.per.cat)
#means.comp <- left_join(ms, summary.comp)
Use Age and Lexical Category Score to predict Morphology and Syntax Scores, for each lexical category.
plot.vocab.comp.prediction <- function(category){
p <- ggplot(filter(prop.comp, lexical_category==category),
aes(x = cdi.prop, y = score, colour = age.group, fill = age.group,
label = age.group)) +
#geom_point(alpha=.5, size=.8) +
geom_jitter(alpha=.5,size=.8) +
geom_smooth(method="lm", formula = y ~ I(x^2) - 1) +
facet_grid(language ~ measure) +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
name = "Category Size") +
scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
"Score (Mean Items)") +
theme_bw(base_size = 14) +
ggtitle(category) +
scale_color_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1")
return(p)
}
plot.vocab.comp.prediction("Nouns")
plot.vocab.comp.prediction("Predicates")
plot.vocab.comp.prediction("Function Words")
Plot new vocabulary composition by language.
#quartz(width=8, height=6)
ggplot(filter(prop.comp, lexical_category != "Other"),
aes(x=vocab.mean, y=cdi.prop, colour=lexical_category,
shape=lexical_category, fill=lexical_category,
label=lexical_category)) +
geom_jitter(size=0.7, alpha=0.2, pch="o") +
facet_wrap(~ language) +
# geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
# linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of CDI Category") +
scale_x_continuous(name = "Vocabulary Size") +
# geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1", name = "Lexical Category") +
scale_fill_brewer(palette = "Set1", guide = FALSE) +
scale_shape_discrete(guide = FALSE) +
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="bottom")
#ggsave(file=("composition.pdf"), width=8, height=6)